# For ML models
from sklearn.linear_model import LinearRegression ,LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC ,SVR
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# NOTE(review): hard-coded absolute Windows paths — this only runs on the
# author's machine; consider a relative path or a configurable data directory.
os.chdir(r"C:\Users\zkr24\Desktop")
# Load the pre-cleaned CDC heart-disease survey dataset.
df = pd.read_csv(r"C:\Users\zkr24\Desktop\heart_2020_cleaned.csv")
# Preview the first five rows.
df.head()
| HeartDisease | BMI | Smoking | AlcoholDrinking | Stroke | PhysicalHealth | MentalHealth | DiffWalking | Sex | AgeCategory | Race | Diabetic | PhysicalActivity | GenHealth | SleepTime | Asthma | KidneyDisease | SkinCancer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | No | 16.60 | Yes | No | No | 3.0 | 30.0 | No | Female | 55-59 | White | Yes | Yes | Very good | 5.0 | Yes | No | Yes |
| 1 | No | 20.34 | No | No | Yes | 0.0 | 0.0 | No | Female | 80 or older | White | No | Yes | Very good | 7.0 | No | No | No |
| 2 | No | 26.58 | Yes | No | No | 20.0 | 30.0 | No | Male | 65-69 | White | Yes | Yes | Fair | 8.0 | Yes | No | No |
| 3 | No | 24.21 | No | No | No | 0.0 | 0.0 | No | Female | 75-79 | White | No | No | Good | 6.0 | No | No | Yes |
| 4 | No | 23.71 | No | No | No | 28.0 | 0.0 | Yes | Female | 40-44 | White | No | Yes | Very good | 8.0 | No | No | No |
df.shape
(319795, 18)
We have roughly 320,000 observations and 18 columns in the dataset.
df.isna().sum()
HeartDisease 0 BMI 0 Smoking 0 AlcoholDrinking 0 Stroke 0 PhysicalHealth 0 MentalHealth 0 DiffWalking 0 Sex 0 AgeCategory 0 Race 0 Diabetic 0 PhysicalActivity 0 GenHealth 0 SleepTime 0 Asthma 0 KidneyDisease 0 SkinCancer 0 dtype: int64
this is a cleaned dataset with no null values
df.describe()
| BMI | PhysicalHealth | MentalHealth | SleepTime | |
|---|---|---|---|---|
| count | 319795.000000 | 319795.00000 | 319795.000000 | 319795.000000 |
| mean | 28.325399 | 3.37171 | 3.898366 | 7.097075 |
| std | 6.356100 | 7.95085 | 7.955235 | 1.436007 |
| min | 12.020000 | 0.00000 | 0.000000 | 1.000000 |
| 25% | 24.030000 | 0.00000 | 0.000000 | 6.000000 |
| 50% | 27.340000 | 0.00000 | 0.000000 | 7.000000 |
| 75% | 31.420000 | 2.00000 | 3.000000 | 8.000000 |
| max | 94.850000 | 30.00000 | 30.000000 | 24.000000 |
We do see some unusual values, such as a BMI of 94.85 or a sleep time of 24 hours. We might need to drop those observations in the data preprocessing phase.
# Percentage of Yes/No values in the target column (class balance check).
counts = df['HeartDisease'].value_counts(normalize=True) * 100
counts
No 91.440454 Yes 8.559546 Name: HeartDisease, dtype: float64
# Donut chart of the HeartDisease class balance.
import plotly.express as px  # already imported at the top; kept for cell independence
# Calculate percentage of Yes and No values
counts = df['HeartDisease'].value_counts(normalize=True) * 100
# Create pie chart using Plotly (hole=0.5 renders it as a donut)
fig = px.pie(counts, values=counts, names=counts.index,
hole=0.5, color=counts.index, color_discrete_map={'Yes':'#4285f4', 'No':'#ea4335'})
# Show percentage and label inside each slice
fig.update_traces(textposition='inside', textinfo='percent+label')
# Add title and show plot
fig.update_layout(title='Yes/No Chart', width=400, height=400)
fig.show()
# Raw class counts confirm the imbalance (~292k No vs ~27k Yes — see output).
from collections import Counter
Counter(df['HeartDisease'])
Counter({'No': 292422, 'Yes': 27373})
as shown in the pie chart, the dataset is quite imbalanced with roughly 9 percent yes and 91 percent no. We need to balance yes/no for better modeling result.
# Overlaid BMI histograms (with KDE) for the disease vs. no-disease groups.
sns.set(style="darkgrid")
sns.set(rc={'figure.figsize':(20,12)})
# create a single-Axes figure (only one height ratio is given, so one Axes)
f, ax_hist = plt.subplots(1, sharex=True, gridspec_kw={"height_ratios": (.85,)})
colours = ['#4285f4', '#ea4335', '#fbbc05', '#34a853']  # NOTE(review): unused in this cell
sns.histplot(df[df['HeartDisease']=='Yes'], x="BMI", ax=ax_hist, kde=True, color="#ea4335")
sns.histplot(df[df['HeartDisease']=='No'], x="BMI", ax=ax_hist, kde=True, color='#4285f4')
# Legend placed outside the axes, top-right.
plt.legend(title='', loc=2, labels=['Heart Disease', 'No HeartDisease'],bbox_to_anchor=(1.02, 1), borderaxespad=0.)
plt.show()
The BMI of people who have heart disease tends to be higher than that of those who do not.
df['AgeCategory'].unique()
array(['55-59', '80 or older', '65-69', '75-79', '40-44', '70-74',
'60-64', '50-54', '45-49', '18-24', '35-39', '30-34', '25-29'],
dtype=object)
# Map each age bucket to a representative numeric age (bucket midpoint;
# "80 or older" is capped at 80) so the buckets can be plotted as a density.
category_mapping = {"18-24": 21, "25-29": 27, "30-34": 32, "35-39": 37, "40-44": 42, "45-49": 47,"50-54":52, "55-59": 57, "60-64":62,"65-69":67,
"70-74":72,"75-79":77,"80 or older":80}
df["AgeCategoryNumeric"] = df["AgeCategory"].map(category_mapping)
# Overlay age KDEs for three conditions for comparison.
# NOTE(review): `shade` is deprecated in newer seaborn (replaced by `fill`) —
# confirm the pinned seaborn version.
fig, ax = plt.subplots(figsize = (12,5))
sns.kdeplot(df[df["HeartDisease"]=='Yes']["AgeCategoryNumeric"], alpha=1,shade = False, color="#ea4335", label="HeartDisease", ax = ax)
sns.kdeplot(df[df["KidneyDisease"]=='Yes']["AgeCategoryNumeric"], alpha=1,shade = False, color="#4285f4", label="KidneyDisease", ax = ax)
sns.kdeplot(df[df["SkinCancer"]=='Yes']["AgeCategoryNumeric"], alpha=1,shade = False, color="#fbbc05", label="SkinCancer", ax = ax)
ax.set_xlabel("AgeCategory")
ax.set_ylabel("Frequency")
ax.legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.)
plt.show()
In general, people are more likely to get diseases when they are getting older.
df = df.drop(columns = 'AgeCategoryNumeric')
import plotly.express as px
import plotly.graph_objects as go
# Bar charts of P(HeartDisease == "Yes") for each level of several
# categorical variables, laid out on a 3x3 grid of subplots.
cat_cols = ["Smoking", "AlcoholDrinking", "DiffWalking", "Sex", "Diabetic", "PhysicalActivity", "Asthma", "Stroke", "GenHealth"]
# Color scale for the bars
color_scale = px.colors.qualitative.Dark24
# One subplot per categorical column
fig = make_subplots(rows=3, cols=3, subplot_titles=cat_cols)
for i, col in enumerate(cat_cols):
    # Grid position for this subplot
    row = i // 3 + 1
    col2 = i % 3 + 1
    # Conditional probability of heart disease within each level of `col`
    probabilities = df.groupby(col)["HeartDisease"].value_counts(normalize=True).unstack()["Yes"]
    # Same values expressed as percentages, used for the bar labels
    percentages = probabilities * 100
    bar_trace = go.Bar(
        x=probabilities.index,
        y=probabilities.values,
        name="Probability of Heart Attack",
        marker=dict(color=color_scale),
        # Fix: format the `text` (percentage) values, not `y` — the original
        # '%{y:.2f}%' printed the raw fraction with a percent sign appended
        # (e.g. "0.08%" where "8.25%" was intended).
        texttemplate='%{text:.2f}%',
        text=percentages.values,
        textposition='auto'
    )
    fig.add_trace(bar_trace, row=row, col=col2)
# Shared layout for the whole grid
fig.update_layout(
    title="Probability of Heart Attack by Categorical Variables",
    xaxis_title="Categorical Variables",
    yaxis_title="Probability of Heart Attack",
    font=dict(size=14),
    height=800,
    width=1200,
    showlegend=False
)
fig.show()
df.describe()
| BMI | PhysicalHealth | MentalHealth | SleepTime | |
|---|---|---|---|---|
| count | 319795.000000 | 319795.00000 | 319795.000000 | 319795.000000 |
| mean | 28.325399 | 3.37171 | 3.898366 | 7.097075 |
| std | 6.356100 | 7.95085 | 7.955235 | 1.436007 |
| min | 12.020000 | 0.00000 | 0.000000 | 1.000000 |
| 25% | 24.030000 | 0.00000 | 0.000000 | 6.000000 |
| 50% | 27.340000 | 0.00000 | 0.000000 | 7.000000 |
| 75% | 31.420000 | 2.00000 | 3.000000 | 8.000000 |
| max | 94.850000 | 30.00000 | 30.000000 | 24.000000 |
I notice that there are some duplicates in the dataset, so I drop them to prevent overfitting of the model.
df.shape
(319795, 18)
319795 rows for the original dataset
# Remove exact duplicate rows so repeated records do not get extra weight.
df = df.drop_duplicates()
# Row count after de-duplication.
df.shape
(301717, 18)
after dropping duplicates, there are 301717 rows
Drop unnecessary columns such as Race. (Note: as implemented below, Race is in fact retained and one-hot encoded rather than dropped.)
df.head()
| HeartDisease | BMI | Smoking | AlcoholDrinking | Stroke | PhysicalHealth | MentalHealth | DiffWalking | Sex | AgeCategory | Race | Diabetic | PhysicalActivity | GenHealth | SleepTime | Asthma | KidneyDisease | SkinCancer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | No | 16.60 | Yes | No | No | 3.0 | 30.0 | No | Female | 55-59 | White | Yes | Yes | Very good | 5.0 | Yes | No | Yes |
| 1 | No | 20.34 | No | No | Yes | 0.0 | 0.0 | No | Female | 80 or older | White | No | Yes | Very good | 7.0 | No | No | No |
| 2 | No | 26.58 | Yes | No | No | 20.0 | 30.0 | No | Male | 65-69 | White | Yes | Yes | Fair | 8.0 | Yes | No | No |
| 3 | No | 24.21 | No | No | No | 0.0 | 0.0 | No | Female | 75-79 | White | No | No | Good | 6.0 | No | No | Yes |
| 4 | No | 23.71 | No | No | No | 28.0 | 0.0 | Yes | Female | 40-44 | White | No | Yes | Very good | 8.0 | No | No | No |
df['GenHealth'].unique()
array(['Very good', 'Fair', 'Good', 'Poor', 'Excellent'], dtype=object)
def helper_Health(x):
    """Map a GenHealth category to an ordinal score (0 = Poor … 4 = Excellent).

    Fix: the original compared against "Very Good" (capital G), but the data
    contains "Very good" (see df['GenHealth'].unique() above), so that level
    fell through to the else branch and was scored 4, same as "Excellent".
    Unknown values still default to 4, preserving the original fallback.
    """
    mapping = {"Poor": 0, "Fair": 1, "Good": 2, "Very good": 3, "Excellent": 4}
    return mapping.get(x, 4)
df['GenHealth'] = df['GenHealth'].apply(helper_Health)
df['AgeCategory'].unique()
array(['55-59', '80 or older', '65-69', '75-79', '40-44', '70-74',
'60-64', '50-54', '45-49', '18-24', '35-39', '30-34', '25-29'],
dtype=object)
from sklearn.preprocessing import OrdinalEncoder
# create an instance of the OrdinalEncoder
encoder = OrdinalEncoder()
# Fit on AgeCategory. OrdinalEncoder orders categories by sorted value; for
# these labels the sorted order happens to coincide with chronological order
# (confirmed by the printed codes below), so the encoding is genuinely ordinal.
encoder.fit(df[['AgeCategory']])
# transform the AgeCategory column using the fitted encoder
df['Age_Ordinal'] = encoder.transform(df[['AgeCategory']])
# print the encoded values
print(df['Age_Ordinal'].unique())
[ 7. 12. 9. 11. 4. 10. 8. 6. 5. 0. 3. 2. 1.]
df = df.drop(columns = ['AgeCategory'])
df['Diabetic'].unique()
array(['Yes', 'No', 'No, borderline diabetes', 'Yes (during pregnancy)'],
dtype=object)
def helper_Diabetic(x):
    """Collapse the four raw Diabetic categories into a binary Yes/No label.

    Borderline diabetes is treated as "No"; gestational diabetes (and any
    other unlisted value) is treated as "Yes", matching the original rules.
    """
    return "No" if x in ("No", "No, borderline diabetes") else "Yes"
# Collapse Diabetic to binary and confirm only Yes/No remain.
df['Diabetic'] = df['Diabetic'].apply(helper_Diabetic)
df['Diabetic'].unique()
array(['Yes', 'No'], dtype=object)
def helper_Heart(x):
    """Encode the HeartDisease label: 1 for "Yes", 0 for anything else."""
    return int(x == "Yes")
# Encode the target as 1/0 and confirm only {0, 1} remain.
df['HeartDisease'] = df['HeartDisease'].apply(helper_Heart)
df['HeartDisease'].unique()
array([0, 1], dtype=int64)
# perform one hot encoding on the remaining categorical columns
# (drop_first=True drops one dummy per column to avoid perfect collinearity)
data = pd.get_dummies(df, columns=['Smoking', 'AlcoholDrinking','Stroke','DiffWalking','Sex',
'Race','Diabetic','PhysicalActivity','Asthma','KidneyDisease','SkinCancer'],drop_first=True)
# preview the encoded dataset
data.head()
| HeartDisease | BMI | PhysicalHealth | MentalHealth | GenHealth | SleepTime | Age_Ordinal | Smoking_Yes | AlcoholDrinking_Yes | Stroke_Yes | ... | Race_Asian | Race_Black | Race_Hispanic | Race_Other | Race_White | Diabetic_Yes | PhysicalActivity_Yes | Asthma_Yes | KidneyDisease_Yes | SkinCancer_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 16.60 | 3.0 | 30.0 | 4 | 5.0 | 7.0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1 |
| 1 | 0 | 20.34 | 0.0 | 0.0 | 4 | 7.0 | 12.0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 2 | 0 | 26.58 | 20.0 | 30.0 | 1 | 8.0 | 9.0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 0 |
| 3 | 0 | 24.21 | 0.0 | 0.0 | 2 | 6.0 | 11.0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 4 | 0 | 23.71 | 28.0 | 0.0 | 4 | 8.0 | 4.0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
5 rows × 22 columns
Check Correlation for the variables
# Correlation heatmap across all encoded columns (annotated, 1 decimal place).
plt.figure(figsize = (30,10))
sns.heatmap(data.corr(),annot=True,fmt='.1f')
<AxesSubplot: >
no variables seem highly correlated, so we keep all of them
train test split
# Feature columns: every column except the target (HeartDisease, column 0).
cols = data.columns[1:]
cols
Index(['BMI', 'PhysicalHealth', 'MentalHealth', 'GenHealth', 'SleepTime',
'Age_Ordinal', 'Smoking_Yes', 'AlcoholDrinking_Yes', 'Stroke_Yes',
'DiffWalking_Yes', 'Sex_Male', 'Race_Asian', 'Race_Black',
'Race_Hispanic', 'Race_Other', 'Race_White', 'Diabetic_Yes',
'PhysicalActivity_Yes', 'Asthma_Yes', 'KidneyDisease_Yes',
'SkinCancer_Yes'],
dtype='object')
# Hold out one third of the data for testing; random_state fixes the split.
from sklearn.model_selection import train_test_split
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(data[cols], data['HeartDisease'], train_size=2/3, random_state = 0)
# count the occurrences of each target class
counts = data['HeartDisease'].value_counts()
# class percentages, to quantify the imbalance
percentages = counts * 100 / sum(counts)
print(percentages)
0 90.964712 1 9.035288 Name: HeartDisease, dtype: float64
Since the dataset is extremely imbalanced, we need to balance the dataset before running the model. Here I use SMOTE method for oversampling.
# SMOTEENN = SMOTE oversampling of the minority class followed by
# Edited-Nearest-Neighbours cleaning; ENN also removes some majority samples,
# which is why the "After" majority count below is smaller than "Before".
# NOTE(review): no random_state is passed, so the resampling is not reproducible.
from imblearn.combine import SMOTEENN
from collections import Counter
counter = Counter(y_train)
print('Before', counter)
# resample the training set only (never the test set)
smenn = SMOTEENN()
X_train_smenn, y_train_smenn = smenn.fit_resample(X_train, y_train)
counter = Counter(y_train_smenn)
print('After', counter)
Before Counter({0: 183021, 1: 18123})
After Counter({1: 168272, 0: 130363})
standardize
from sklearn.preprocessing import StandardScaler
# Standardize features by removing the mean and scaling to unit variance.
# The scaler is fitted on the (resampled) training data only, and the same
# transform is applied to the test set — avoids test-set leakage.
scaler = StandardScaler().fit(X_train_smenn)
# transform so each feature has mean 0 and standard deviation 1
X_train_smenn_scaled = scaler.transform(X_train_smenn)
X_test_scaled = scaler.transform(X_test)
apply models
# Importing Classifier Modules
# NOTE(review): GaussianNB is imported twice; Perceptron, LinearSVC and
# GradientBoostingClassifier are imported but never used below.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
# Candidate classifiers, seeded where supported for reproducibility.
# SVC is commented out here (fitted separately later on the smaller
# undersampled set).
Models = [
DecisionTreeClassifier(random_state=0),
RandomForestClassifier(random_state=0),
XGBClassifier(random_state=0),
LogisticRegression(random_state=0, max_iter = 500),
SGDClassifier(random_state=0),
KNeighborsClassifier(),
GaussianNB()
#SVC()
]
def perf_table(x_train, y_train, x_test, y_test, Models):
    """Fit every estimator in ``Models`` and tabulate its fit time plus
    train/test accuracy, precision, recall and F1 score.

    Parameters
    ----------
    x_train, y_train : training features and binary labels
    x_test, y_test   : held-out features and binary labels
    Models           : iterable of unfitted sklearn-style estimators
                       (must implement ``fit`` and ``predict``)

    Returns
    -------
    pd.DataFrame
        One row per model. Note: the estimators in ``Models`` are fitted
        in place as a side effect.

    Fix: the original computed train/test F1 but never stored it; the two
    F1 columns are now appended (existing columns are unchanged, so callers
    that sort by e.g. 'Test_Recall' are unaffected). The unused
    ``cross_val_score`` import was dropped.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    import time

    Models_columns = ['Name', 'Time', 'Train_Accuracy', 'Test_Accuracy',
                      'Train_Precision', 'Test_Precision',
                      'Train_Recall', 'Test_Recall',
                      'Train_F1', 'Test_F1']
    Models_compare = pd.DataFrame(columns=Models_columns)
    for row_index, alg in enumerate(Models):
        Models_compare.loc[row_index, 'Name'] = alg.__class__.__name__
        # Time the fit only; prediction time is excluded.
        start_time = time.time()
        alg.fit(x_train, y_train)
        Models_compare.loc[row_index, 'Time'] = time.time() - start_time
        # Score on both splits to expose overfitting (train >> test).
        y_train_pred = alg.predict(x_train)
        y_test_pred = alg.predict(x_test)
        Models_compare.loc[row_index, 'Train_Accuracy'] = accuracy_score(y_train, y_train_pred)
        Models_compare.loc[row_index, 'Test_Accuracy'] = accuracy_score(y_test, y_test_pred)
        Models_compare.loc[row_index, 'Train_Precision'] = precision_score(y_train, y_train_pred)
        Models_compare.loc[row_index, 'Test_Precision'] = precision_score(y_test, y_test_pred)
        Models_compare.loc[row_index, 'Train_Recall'] = recall_score(y_train, y_train_pred)
        Models_compare.loc[row_index, 'Test_Recall'] = recall_score(y_test, y_test_pred)
        Models_compare.loc[row_index, 'Train_F1'] = f1_score(y_train, y_train_pred)
        Models_compare.loc[row_index, 'Test_F1'] = f1_score(y_test, y_test_pred)
    return Models_compare
Models_compare = perf_table(X_train_smenn_scaled, y_train_smenn,X_test_scaled, y_test, Models)
Model Assessment
# Rank models by test recall — the priority metric for this screening task.
Models_compare.sort_values(by = ['Test_Recall'], ascending = False, inplace = True)
Models_compare
| Name | Time | Train_Accuracy | Test_Accuracy | Train_Precision | Test_Precision | Train_Recall | Test_Recall | |
|---|---|---|---|---|---|---|---|---|
| 6 | GaussianNB | 0.111855 | 0.802813 | 0.570252 | 0.774703 | 0.150481 | 0.916617 | 0.802911 |
| 3 | LogisticRegression | 0.293891 | 0.858064 | 0.69456 | 0.864308 | 0.197398 | 0.887426 | 0.7703 |
| 4 | SGDClassifier | 0.6301 | 0.858366 | 0.689449 | 0.861451 | 0.192519 | 0.892121 | 0.756949 |
| 5 | KNeighborsClassifier | 0.030103 | 0.941718 | 0.717181 | 0.928646 | 0.195201 | 0.97119 | 0.676516 |
| 2 | XGBClassifier | 12.337272 | 0.93544 | 0.796188 | 0.94317 | 0.249956 | 0.942195 | 0.621361 |
| 1 | RandomForestClassifier | 42.236744 | 0.99997 | 0.793414 | 0.999947 | 0.23832 | 1.0 | 0.579996 |
| 0 | DecisionTreeClassifier | 1.889356 | 0.99997 | 0.779464 | 0.999947 | 0.207 | 1.0 | 0.504158 |
In this case, recall is probably more important than precision or accuracy, since there is a high cost associated with a false negative. Recall measures how many of the actual positives our model captures by labeling them as positive (true positives), so we can detect those actual cases and possibly save lives.
It is also worth trying undersampling techniques to balance the dataset.
# Re-create the original train/test split (same random_state => same split).
from sklearn.model_selection import train_test_split
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(data[cols], data['HeartDisease'], train_size=2/3, random_state = 0)
from imblearn.under_sampling import RandomUnderSampler
# Undersample the majority class down to a 0.9 minority/majority ratio.
# NOTE(review): no random_state, so the sampled subset is not reproducible.
under_sampler = RandomUnderSampler(sampling_strategy=0.9)
X_under, y_under = under_sampler.fit_resample(X_train, y_train)
# Pie chart of the post-balancing class ratio.
fig, ax = plt.subplots(1, 1,figsize =(15, 8))
ax.pie(y_under.value_counts(),autopct='%1.1f%%',labels=['No',"Yes"],explode=[0.03 for i in data['HeartDisease'].value_counts().index])
ax.set_title('Post-Balanced Ratio of Heart Disease',weight = 'bold')
plt.legend(bbox_to_anchor=(1, 1))
fig.show()
print(f'After undersampling: {Counter(y_under)}')
After undersampling: Counter({0: 20136, 1: 18123})
<ipython-input-63-898bc5b69405>:5: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.
Standardize
from sklearn.preprocessing import StandardScaler
# Standardize features by removing the mean and scaling to unit variance.
# As before: fit on the (undersampled) training data only, then apply the
# same transform to the test set.
scaler = StandardScaler().fit(X_under)
# transform so each feature has mean 0 and standard deviation 1
X_train_scaled = scaler.transform(X_under)
X_test_scaled = scaler.transform(X_test)
# Re-run the model comparison on the undersampled data and rank by test recall.
Models_compare = perf_table(X_train_scaled, y_under, X_test_scaled, y_test, Models)
Models_compare.sort_values(by = ['Test_Recall'], ascending = False, inplace = True)
Models_compare
| Name | Time | Train_Accuracy | Test_Accuracy | Train_Precision | Test_Precision | Train_Recall | Test_Recall | |
|---|---|---|---|---|---|---|---|---|
| 2 | XGBClassifier | 1.049253 | 0.805379 | 0.73815 | 0.778584 | 0.22431 | 0.823263 | 0.765594 |
| 3 | LogisticRegression | 0.033403 | 0.757861 | 0.761238 | 0.740016 | 0.238089 | 0.753573 | 0.739877 |
| 1 | RandomForestClassifier | 4.326863 | 0.997752 | 0.726786 | 0.997627 | 0.211708 | 0.997627 | 0.736923 |
| 4 | SGDClassifier | 0.242459 | 0.750176 | 0.763644 | 0.739099 | 0.236977 | 0.730453 | 0.721383 |
| 5 | KNeighborsClassifier | 0.0 | 0.803314 | 0.721725 | 0.786525 | 0.204032 | 0.802626 | 0.710987 |
| 6 | GaussianNB | 0.015665 | 0.718445 | 0.768725 | 0.727966 | 0.226275 | 0.64763 | 0.638761 |
| 0 | DecisionTreeClassifier | 0.159304 | 0.997778 | 0.679129 | 1.0 | 0.16647 | 0.99531 | 0.631757 |
Both oversampling and undersampling show similar test recall, so I choose undersampling here for its shorter training time, and because I do not want synthetic data to bias my predictions.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import svm
# SVM with RBF kernel — fitted only on the smaller undersampled set
# (SVC training does not scale well to the full ~300k rows).
clf = svm.SVC(kernel='rbf', gamma='scale', C=1)
# Fit the model to the undersampled, scaled training data
clf.fit(X_train_scaled, y_under)
# Predict on the (scaled) test data
y_pred = clf.predict(X_test_scaled)
conf = confusion_matrix(y_test, y_pred)
clf_report = classification_report(y_test, y_pred)
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Confusion Matrix :
[[70558 20877]
[ 2473 6665]]
Classification Report :
precision recall f1-score support
0 0.97 0.77 0.86 91435
1 0.24 0.73 0.36 9138
accuracy 0.77 100573
macro avg 0.60 0.75 0.61 100573
weighted avg 0.90 0.77 0.81 100573
from sklearn.linear_model import LogisticRegression
# Logistic regression baseline with default hyperparameters
logreg = LogisticRegression()
# Fit on the undersampled, scaled training data
logreg.fit(X_train_scaled, y_under)
y_pred_lg = logreg.predict(X_test_scaled)
conf = confusion_matrix(y_test, y_pred_lg)
clf_report = classification_report(y_test, y_pred_lg)
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Confusion Matrix :
[[71950 19485]
[ 2678 6460]]
Classification Report :
precision recall f1-score support
0 0.96 0.79 0.87 91435
1 0.25 0.71 0.37 9138
accuracy 0.78 100573
macro avg 0.61 0.75 0.62 100573
weighted avg 0.90 0.78 0.82 100573
# Manually chosen XGBoost hyperparameters (pre-tuning baseline).
# NOTE(review): the variable name `xgb` is rebound to the xgboost *module*
# by the `import xgboost as xgb` in the tuning cell below — rename one.
xgb = XGBClassifier(booster = 'gbtree', learning_rate = 0.1, max_depth = 10, n_estimators = 180)
xgb.fit(X_train_scaled, y_under)
y_pred_xgb = xgb.predict(X_test_scaled)
conf = confusion_matrix(y_test, y_pred_xgb)
clf_report = classification_report(y_test, y_pred_xgb)
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Confusion Matrix :
[[67272 24163]
[ 2231 6907]]
Classification Report :
precision recall f1-score support
0 0.97 0.74 0.84 91435
1 0.22 0.76 0.34 9138
accuracy 0.74 100573
macro avg 0.60 0.75 0.59 100573
weighted avg 0.90 0.74 0.79 100573
tune the model for best recall
# Hyperparameter search for XGBoost, optimizing cross-validated recall.
# NOTE(review): this import shadows the fitted-model variable `xgb` created above.
import xgboost as xgb
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
# Base XGBoost model (binary objective, fixed seed)
model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)
# Grid: 3*3*3*3 = 81 candidates x 5 folds = 405 fits — slow.
param_grid = {
'learning_rate': [0.01, 0.1, 0.5],
'max_depth': [3, 5, 7],
'n_estimators': [100, 500, 1000],
'scale_pos_weight': [1, 3, 5]
}
# 5-fold CV, scored on recall (the priority metric here)
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5, scoring='recall')
grid_search.fit(X_train_scaled, y_under)
# Refit a fresh model with the best parameters on the full training set
best_model = xgb.XGBClassifier(**grid_search.best_params_)
best_model.fit(X_train_scaled, y_under)
# Evaluate on the held-out test set
y_pred = best_model.predict(X_test_scaled)
# Recall on the test set
recall = recall_score(y_test, y_pred)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Validation recall: {recall}")
rerun model with best parameter
# Refit with the parameters found by the grid search (hard-coded so this
# cell can run without repeating the search).
# NOTE(review): scale_pos_weight=5 maximizes recall at a severe precision
# cost — see the confusion matrix below (≈60k false positives).
best_params ={'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'scale_pos_weight': 5}
model = XGBClassifier(**best_params)
model.fit(X_train_scaled, y_under)
y_pred_xgb = model.predict(X_test_scaled)
conf = confusion_matrix(y_test, y_pred_xgb)
clf_report = classification_report(y_test, y_pred_xgb)
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Confusion Matrix :
[[31773 59662]
[ 306 8832]]
Classification Report :
precision recall f1-score support
0 0.99 0.35 0.51 91435
1 0.13 0.97 0.23 9138
accuracy 0.40 100573
macro avg 0.56 0.66 0.37 100573
weighted avg 0.91 0.40 0.49 100573
import pandas as pd
import matplotlib.pyplot as plt
# Feature importances of the tuned XGBoost model, indexed by feature name
importances_xgb = pd.Series(model.feature_importances_,
index = cols)
# Keep the seven largest importances (sort ascending, take the tail)
sorted_importances_xgb = importances_xgb.sort_values()[-7:]
# Horizontal bar plot, largest at the top
sorted_importances_xgb.plot(kind='barh', color='blue')
plt.show()
The most important features include age, general health, physical health, sex, diabetic status, stroke, and smoking.
Our final model is the tuned XGBoost, due to its highest recall of 97%. That means we can capture about 97% of the patients who actually have heart disease. The challenge now is the model's low precision: it has trouble distinguishing positive cases from negative ones. Increasing the amount as well as the diversity of the data is definitely needed.
Our model also suggests that forming good habits is beneficial for health: giving up smoking and doing more physical exercise can substantially reduce the risk of heart and vascular disease.